For the purpose of this analysis, we define success in the following way:
Successful companies are those that have secured at least one funding round exceeding 20 million dollars.
In contrast, unsuccessful companies are identified as those that have either shut down or have been acquired as a means of survival.
# Libraries to use. We load them quietly
packages <- c("tidyverse", "googlesheets4", "glmnet", "caret", "ggthemes", "ggpubr", "stargazer",
"psych","summarytools","kableExtra","scales","reshape2","car", "forcats", "caTools",
"tidymodels","pROC", "ModelMetrics","oddsratio","epiDisplay","gridExtra")
invisible(lapply(packages, function(x) {
suppressPackageStartupMessages(library(x, character.only = TRUE, quietly = TRUE))}))
# Build the company-level investment table `raw_data_vc` from the raw funding
# rounds in `sheet_data_invest` (one input row per funding round).
#
# Steps: rename columns, clean categorical values, derive per-company
# aggregates, drop incomplete/implausible rows, keep only companies that can be
# labelled Successful / Not successful, and finally keep one row per company.
# The result is still grouped by `company`, as in the original pipeline.
raw_data_vc <- sheet_data_invest %>%
  # Rename variables. "Composición del equipo" is mapped exactly once (to
  # `team`); the original mapped the same source column to two different
  # names, which is ambiguous and only `team` is used afterwards.
  rename(
    company = "Company",
    vertical = "Vertical General",
    status = "Status",
    hq = "HQ Company",
    deal_type = "Deal Type",
    founding_year = "Founding year (company)",
    funding_year = "Funding Year",
    funding_usd = "USD",
    employees = "Employees",
    total_raised = "Total Raised",
    funding_date = "Funding Date",
    n_investors = "Number of investors",
    clasf_unicorn = `New clasf unicorn`,
    exit_date = `Exit year`,
    team = `Composición del equipo`,
    hq_inv = `Headquarters of Lead Investor`
  ) %>%
  # Group by company name so the aggregates below are computed per company
  group_by(company) %>%
  # Clean existing variables and correct some data capture errors
  mutate(
    funding_date = as.Date(funding_date, format = "%Y-%m-%d"),
    # Year-only fields are anchored to 1 January of that year
    founding_year = as.Date(paste0(founding_year, "-01-01"), format = "%Y-%m-%d"),
    exit_date = as.Date(paste0(exit_date, "-01-01"), format = "%Y-%m-%d"),
    # Normalise `status` BEFORE deriving `end_date`: otherwise rows captured
    # as lowercase "acquired" never match "Acquired", get an NA end date and
    # are silently dropped by the end-year filter further down.
    status = factor(
      ifelse(status %in% c("acquired", "Acquired"), "Acquired", status),
      levels = c("Acquired", "Closed", "Operating")
    ),
    # Closed companies end at their last funding date, acquired ones at their
    # exit year; every other status has no end date (NA).
    end_date = case_when(
      status == "Closed" ~ max(funding_date),
      status == "Acquired" ~ exit_date
    ),
    vertical = ifelse(vertical %in% c("Insurtech", "InsurTech"), "Insurtech", vertical),
    deal_type = ifelse(
      deal_type %in% c("Line of Credit", "Line of credit"),
      "Line of credit",
      deal_type
    ),
    team = factor(
      ifelse(
        team %in% c("Equipo femenino", "Equipo Femenino"), "Equipo femenino",
        ifelse(team %in% c("Equipo Mixto", "Equipo mixto"), "Equipo mixto", team)
      ),
      levels = c(
        "Equipo femenino", "Equipo masculino", "Equipo mixto",
        "Un solo hombre", "Una sola mujer"
      )
    ),
    clasf_unicorn = factor(
      ifelse(
        clasf_unicorn %in% c("Was acquired", "Was Acquired"),
        "Was Acquired",
        clasf_unicorn
      ),
      levels = c("IPO", "Rest of Companies", "Soonicorn", "Unicorn", "Was Acquired")
    ),
    # Per-company aggregates (computed before the row filters below are applied)
    median_funding = median(funding_usd, na.rm = TRUE),
    total_investors = sum(n_investors),
    funding_rounds = n(),
    foreign_funding_all = if_else(hq_inv == hq, "No", "Yes"),
    foreign_funding_all = factor(foreign_funding_all, levels = c("No", "Yes")),
    # "Foreign" here means the lead investor is outside the company's country
    # AND outside the listed Latin American countries
    foreign_funding = ifelse(
      hq != hq_inv &
        !(hq_inv %in% c(
          "Argentina", "Brazil", "Chile",
          "Colombia", "Ecuador", "Mexico",
          "Peru", "Venezuela"
        )),
      "Yes",
      "No"
    ),
    same_country_funding = ifelse(hq == hq_inv, "Yes", "No"),
    regional_funding = ifelse(
      hq %in% c(
        "Argentina", "Brazil", "Chile",
        "Colombia", "Ecuador", "Mexico",
        "Peru", "Venezuela"
      ) &
        hq_inv %in% c(
          "Argentina", "Brazil", "Chile",
          "Colombia", "Ecuador", "Mexico",
          "Peru", "Venezuela"
        ),
      "Yes",
      "No"
    ),
    # Manual corrections of data capture errors for specific companies;
    # everyone else uses the earliest / latest funding date of the company
    first_funding_round = case_when(
      company == "Brex" ~ as.Date("2019-04-16", format = "%Y-%m-%d"),
      company == "Hackmetrix" ~ as.Date("2020-07-01", format = "%Y-%m-%d"),
      company == "Hash (Financial Software)" ~ as.Date("2017-01-05", format = "%Y-%m-%d"),
      company == "Inventa (B2B Marketplace)" ~ as.Date("2021-08-17", format = "%Y-%m-%d"),
      company == "Kavak" ~ as.Date("2016-08-01", format = "%Y-%m-%d"),
      company == "Nexu" ~ as.Date("2014-01-01", format = "%Y-%m-%d"),
      company == "Solfacil" ~ as.Date("2018-01-01", format = "%Y-%m-%d"),
      TRUE ~ min(funding_date)
    ),
    last_funding_round = case_when(
      company == "Brex" ~ as.Date("2019-12-11", format = "%Y-%m-%d"),
      company == "Hackmetrix" ~ as.Date("2021-11-28", format = "%Y-%m-%d"),
      company == "Hash (Financial Software)" ~ as.Date("2021-10-20", format = "%Y-%m-%d"),
      company == "Inventa (B2B Marketplace)" ~ as.Date("2022-01-23", format = "%Y-%m-%d"),
      company == "Kavak" ~ as.Date("2022-09-20", format = "%Y-%m-%d"),
      company == "Nexu" ~ as.Date("2022-01-26", format = "%Y-%m-%d"),
      company == "Solfacil" ~ as.Date("2022-09-22", format = "%Y-%m-%d"),
      TRUE ~ max(funding_date)
    ),
    # Durations in months (days / 30). The original passed `digits = 1` to
    # as.numeric(), where it is ignored, so values were never rounded to one
    # decimal. The stray argument is removed; the numbers are unchanged.
    months_to_first_round = as.numeric(
      round(difftime(first_funding_round, founding_year, units = "days")) / 30
    ),
    months_between_first_and_last_round = as.numeric(
      round(difftime(last_funding_round, first_funding_round, units = "days")) / 30
    ),
    # Gap, in months, between each round and the previous round of the same
    # company. The original wrapped the lagged difference in diff(), which
    # produced the change between consecutive gaps instead of the gaps
    # themselves; `order_by` makes the result independent of row order.
    months_between_rounds = as.numeric(
      difftime(
        funding_date,
        dplyr::lag(funding_date, order_by = funding_date),
        units = "days"
      )
    ) / 30,
    # Collapse the five team compositions into team vs. solo founder.
    # as.character() keeps the fallback branch type-compatible with the
    # character branches above on every dplyr version.
    team = case_when(
      team == "Equipo femenino" ~ "Co-Founder Team",
      team == "Equipo masculino" ~ "Co-Founder Team",
      team == "Equipo mixto" ~ "Co-Founder Team",
      team == "Un solo hombre" ~ "Solo Founder",
      team == "Una sola mujer" ~ "Solo Founder",
      TRUE ~ as.character(team)
    ),
    team = factor(team, levels = c("Co-Founder Team", "Solo Founder")),
    avg_months_between_rounds = mean(months_between_rounds, na.rm = TRUE)
  ) %>%
  # Filter out NAs and absurd values, such as a company having 0 investors or
  # a non-positive total raised; keep companies founded in 2013 or later
  filter(
    complete.cases(
      funding_usd, total_raised, n_investors,
      founding_year, employees, funding_date
    ),
    !grepl("Undisclosed - VC", deal_type),
    n_investors > 0,
    total_raised > 0,
    as.numeric(format(founding_year, "%Y")) >= 2013
  ) %>%
  # Company classification starts from this point.
  # Unicorns / Soonicorns are the companies that got a +20M USD funding round.
  # Closed or acquired companies are only considered when they ended in 2018
  # or later. Parentheses make the original `|` / `&` precedence explicit.
  filter(
    clasf_unicorn %in% c("Unicorn", "Soonicorn") |
      (status %in% c("Closed", "Acquired") &
        as.numeric(format(end_date, "%Y")) >= 2018)
  ) %>%
  # Successful: Unicorn or Soonicorn status (plus four manual overrides).
  # Not successful: closed or acquired.
  mutate(
    success = ifelse(
      clasf_unicorn %in% c("Unicorn", "Soonicorn"), "Successful",
      ifelse(status %in% c("Closed", "Acquired"), "Not successful", NA)
    ),
    success = ifelse(
      company %in% c("Auth0", "Checkars", "Cornershop", "Daki"),
      "Successful",
      success
    ),
    success = factor(success, levels = c("Successful", "Not successful"))
  ) %>%
  # NOTE(review): `n_investors` and `funding_usd` kept here are round-level
  # values of whichever row distinct() retains, while the company-level
  # `total_investors` computed above is not carried forward - confirm intended.
  dplyr::select(
    company, vertical, hq, hq_inv, success, founding_year, end_date,
    foreign_funding_all, same_country_funding,
    median_funding, funding_usd,
    regional_funding, employees, team, months_to_first_round,
    months_between_first_and_last_round, avg_months_between_rounds,
    total_raised, funding_rounds, n_investors
  ) %>%
  # Keep the first remaining row of each company
  distinct(company, .keep_all = TRUE)
# Build the company-level founder table `raw_data_scraper` from the scraped
# founder profiles in `sheet_data_scraper` (one input row per founder).
#
# Founder-level values are aggregated within each company (sums for
# experience/skill counts, medians for ages) and one row per company is kept.
# The result is still grouped by `company`, as in the original pipeline.
raw_data_scraper <- sheet_data_scraper %>%
  # Rename variables. `foreign_exp` was listed twice in the original call;
  # the redundant second entry is removed.
  rename(
    company = `Nombre de la compañía del mapeo`,
    exp_years = "Años de Experiencia",
    entrep_exp_years = "Años de experiencia como emprendedor",
    age = "Edad Actual",
    entrep_age = "Edad de emprendimiento",
    n_entrep_founded = "Numero de emprendimientos Fundados",
    hundred_employees = "Worked on a +100 Company",
    status = "Status Unicorn (1=unicorn, 3=soonicorn, 0=other companies)",
    foreign_exp = "Foreign exp",
    cofounders = "No. de Cofounders",
    founders = "# founders",
    employments_before_entrep = "Empleos previos a emprender",
    business = Business,
    entrep = Entrepreneurship,
    finance = Financial,
    marketing = Marketing,
    maths_eng = `Maths & Engineering`,
    pmanag = `Project Management`,
    research = Research,
    sales = Sales,
    soft = `Soft Skills`,
    consulting_exp = Consulting_exp,
    founder_exp = Founder_exp,
    clevel_exp = cLevel_exp,
    field_of_study = `Undergraduate field of study`,
    exp_years_categ = `Categorización Años de experiencia`
  ) %>%
  group_by(company) %>%
  # The column order chosen here matters: the across() range below
  # (n_entrep_founded:founder_exp) is resolved against this order.
  dplyr::select(
    company, exp_years, exp_years_categ, cofounders,
    entrep_exp_years, employments_before_entrep,
    age, entrep_age, n_entrep_founded, hundred_employees,
    business, entrep, finance, marketing, maths_eng, pmanag,
    research, sales, soft, manager_exp, clevel_exp, engineering_exp,
    sales_exp, finance_exp, human_capital_exp, consulting_exp, founder_exp,
    field_of_study, foreign_exp
  ) %>%
  # Mutate and create new variables by company
  mutate(
    # Text labels -> number of co-founders ("5 o más" is capped at 5)
    cofounders = dplyr::recode(
      cofounders,
      "2 Cofounders" = 2,
      "3 Cofounders" = 3,
      "4 Cofounders" = 4,
      "5 o más Cofounders" = 5,
      "Single founder" = 1
    ),
    # Sums / medians over all founders of the company. No na.rm is used, so a
    # single missing founder value makes the company-level value NA.
    exp_years = sum(exp_years),
    entrep_exp_years = sum(entrep_exp_years),
    employments_before_entrep = sum(employments_before_entrep),
    age = median(age),
    entrep_age = median(entrep_age),
    across(n_entrep_founded:founder_exp, sum)
  ) %>%
  # Drops negative totals and, because NA comparisons are not TRUE, companies
  # whose summed experience is NA
  filter(exp_years >= 0) %>%
  # Keep a unique observation for each company
  distinct(company, .keep_all = TRUE)
# Attach the founder-level aggregates to each company's investment record.
# Companies without founder data (no `cofounders` value after the join) are
# dropped, teams of 4 or 5 co-founders are pooled as "4+", and `team` is
# re-derived from the co-founder count. The result is grouped by `cofounders`.
joined_data_scraper <- raw_data_vc %>%
  left_join(raw_data_scraper, by = "company") %>%
  mutate(
    cofounders = as.character(cofounders),
    cofounders = if_else(cofounders %in% c("4", "5"), "4+", cofounders)
  ) %>%
  filter(!is.na(cofounders)) %>%
  group_by(cofounders) %>%
  mutate(
    # `cofounders` is character at this point, so compare against "1"
    team = case_when(
      cofounders == "1" ~ "Solo Founder",
      TRUE ~ "Co-Founder Team"
    )
  )
For confidentiality reasons, I can’t provide the complete data frame.
However, I’m able to share a more condensed, high-level view of the
data:
| company | hq | total_raised | months_between_first_and_last_round | team |
|---|---|---|---|---|
| NetLex | Brazil | 53230000 | 32.06667 | Solo Founder |
| Sofía | Mexico | 31600000 | 10.43333 | Co-Founder Team |
| La Plataforma | Chile | 80000 | 19.50000 | Co-Founder Team |
| DogHero | Brazil | 12500000 | 31.10000 | Co-Founder Team |
| byprice.com | Mexico | 51760000 | 41.43333 | Co-Founder Team |
| Variable | Successful | Not successful |
|---|---|---|
| Months to first round | 17 | 19 |
| Months between first and last round | 29 | 14 |
| Funding rounds | 4 | 2 |
| Employees | 174 | 11 |
| Investors | 3 | 1 |
| Founding Year | 2017 | 2016 |
| End Year | NA | 2020 |
The table reveals some intriguing findings:
- Successful companies typically secure their initial funding
round two months ahead of unsuccessful ones.
- Successful companies usually have two additional funding rounds
compared to unsuccessful ones.
- Unsuccessful companies often struggle to expand their workforce
beyond 50 employees.
- The highest number of closures among unsuccessful companies
occurred in 2020, likely influenced by the pandemic and
issues accessing funding.
Figure 3. Team Composition Distribution
- While we shouldn’t equate correlation with causation, the
graph does suggest an interesting pattern. Having co-founders doesn’t
assure success, yet only 17% of co-founder companies failed, whereas
among solo-founder companies the share of failed companies rises to
30%.
Figure 4: Top Performing Industries
- E-commerce stands out as the most successful industry, with
84% of companies in this sector achieving success.
- Conversely, the AdTech/Media industry exhibits the highest failure
rate, as 29% of companies within this sector have either shut
down or been acquired.
Figure 5. Funding Locations
Another noteworthy observation pertains to the geographical
origin of funding that entrepreneurs secure.
- Successful companies have a 72% rate of getting a foreign funding
round.
- On the other hand, unsuccessful companies get much more funding
from local funds, with 52% receiving funds from within their own
country versus 28% for successful companies.
Figure 6. Distribution of first entrepreneurship
- The graph displays the distribution of the age at which founders
launched their first venture, for both teams.
- Solo founders tend to launch their first venture
two years earlier than co-founder teams.
- The bubble size represents the number of employees for each age.
- For both teams, the longer their venture has been operating, the
more employees they manage.
Figure 7. Correlation graph
- Both groups display a positive correlation: the more
experience they have, the more investment rounds they secure.
- The correlation value for the team of solo founders is higher (0.33).
This could suggest that solo founders with more experience may secure
more investment rounds than co-founder teams with similar years of
experience.
- The p-value is less than 0.05, indicating a
significant correlation, unlikely to be caused by chance.
Figure 8. Skills Comparison
- There’s a 31% increase in individuals with project management
skills in co-founder teams versus solo founders.
- Financial skills come second, with a difference of 23% in
co-founder teams versus solo founders.
- Both groups excel in business operations, emphasizing
the importance of managing ventures effectively, regardless of the
number of founders.
Figure 9. Time it takes companies to move from one funding round to another
The timeframe a company takes to move forward to the next
investment stages is another key element that investors consider. The
graph shows the median duration a company typically takes to progress
from an early to a later investment stage. A few interesting
observations are:
- Successful companies usually secure a Series A investment within a
year. In contrast, unsuccessful companies can take up to a year and a
half.
- The shift to a Series B is challenging for all
companies, but successful ones manage it in 13 months,
while unsuccessful ones can take as long as 30 months — over
twice as long.
- This trend continues into the transition
to Series C, where unsuccessful companies
require 27 months.
- Unsuccessful companies generally
fail to reach a Series D or further rounds.
- Successful companies maintain a steady influx of investments,
with an average interval of a year between rounds.
Now, based on the previous analysis, I’m developing a logistic
regression model.
# Fix the RNG state so the train/test split below is reproducible
set.seed(123)

# Build the modelling table: numeric columns of `raw_data_vc` only.
# The three categorical predictors are first encoded as 0/1 so that they
# survive the numeric-only selection, then turned back into factors:
#   success             1 = Successful,       0 = Not successful
#   team                1 = Co-Founder Team,  0 = Solo Founder
#   foreign_funding_all 1 = received foreign funding, 0 = did not
numeric_companies <- raw_data_vc %>%
  mutate(
    success = if_else(success == "Successful", 1, 0),
    team = if_else(team == "Co-Founder Team", 1, 0),
    foreign_funding_all = if_else(foreign_funding_all == "Yes", 1, 0)
  ) %>%
  ungroup() %>%
  # Select all numeric variables (Date and character columns are dropped)
  dplyr::select(where(is.numeric)) %>%
  # Factors so the model captures the levels correctly
  mutate(across(c(success, team, foreign_funding_all), as.factor))

# Split training (70%) and testing (30%) datasets, stratified on the outcome
datos_split <- initial_split(numeric_companies, prop = 0.7, strata = success)
datos_train <- training(datos_split)
datos_test <- testing(datos_split)

# Fit the logistic regression model where the variable to predict is the
# success of a company; three funding-amount/interval columns are excluded
# from the regressors.
logit <- glm(success ~ ., data = datos_train %>% dplyr::select(c(-avg_months_between_rounds,
                                                                -funding_usd,
                                                                -median_funding)), family = "binomial")
The variable to predict is the success of a company.
70% of the data is assigned to training the model.
The results show statistical significance for four regressors
(Total Employees, Team Composition, Total Raised and Total Investors)
at a 95% confidence level.
The model’s AIC is low, indicating its
effectiveness.
While it would be ideal to observe statistical significance
across all regressors, the focus of this analysis is more on the model’s
predictive capabilities and accuracy. Therefore, this aspect doesn’t
undermine the analysis.
| | Dependent variable: company_success |
|---|---|
| Received Foreign Funding (1=Yes)1 | -0.15 (0.61) |
| Total Employees | 0.01** (0.005) |
| Team Composition (1= Co-Founder Team)1 | 1.59** (0.78) |
| Months to achieve first round | -0.01 (0.01) |
| Months between first and last round | -0.01 (0.02) |
| Total Raised | 0.0000*** (0.00) |
| Total funding rounds | 0.44 (0.27) |
| Total Investors | 0.29** (0.15) |
| Constant | -3.55*** (1.02) |
| Observations | 165 |
| Log Likelihood | -38.57 |
| Akaike Inf. Crit. | 95.14 |
| Note: | \*p<0.1; \*\*p<0.05; \*\*\*p<0.01 (standard errors in parentheses) |
| Variable | VIF Value |
|---|---|
| Received Foreign Funding (1=Yes) | 1.112138 |
| Total Employees | 1.110112 |
| Team Composition (1= Co-Founder Team) | 1.143243 |
| Months to achieve first round | 1.170025 |
| Months between first and last round | 2.437043 |
| Total Raised | 1.106142 |
| Total funding rounds | 2.558997 |
| Total Investors | 1.042223 |
Confusion matrix, test set (73 companies):
| | Actual Failure | Actual Success |
|---|---|---|
| Predicted Failure | 10 | 2 |
| Predicted Success | 4 | 57 |
Confusion matrix, training set (165 companies):
| | Actual Failure | Actual Success |
|---|---|---|
| Predicted Failure | 27 | 11 |
| Predicted Success | 5 | 122 |
Performance metrics, test set:
| Accuracy | Precision | Recall | F1_Score |
|---|---|---|---|
| 0.9178082 | 0.9661017 | 0.9344262 | 0.95 |
Performance metrics, training set:
| Accuracy | Precision | Recall | F1_Score |
|---|---|---|---|
| 0.9030303 | 0.9172932 | 0.9606299 | 0.9384615 |
The following data frames contain real data from two companies.
Due to confidentiality obligations, I won’t disclose the names of these
companies. However, this data is crucial for making predictions about
their success.
Company A:
| Total Employees | Received Foreign Funding (1=Yes) | Team Composition (1= Co-Founder Team) | Months to achieve first round | Months between first and last round | Total Raised | Total funding rounds | Total Investors |
|---|---|---|---|---|---|---|---|
| 109 | 1 | 1 | 12 | 16 | 17310000 | 3 | 4 |
Company B:
| Total Employees | Received Foreign Funding (1=Yes) | Team Composition (1= Co-Founder Team) | Months to achieve first round | Months between first and last round | Total Raised | Total funding rounds | Total Investors |
|---|---|---|---|---|---|---|---|
| 26 | 1 | 0 | 24 | 36 | 10000000 | 2 | 2 |
As seen in the provided data, there are significant differences
between the two companies:
1. Company A has successfully
expanded to over 50 employees, indicating consistent growth, while
Company B hasn’t yet reached this milestone.
2. Both companies
have attracted foreign investment.
3. Company A was launched
by a team of co-founders, whereas Company B was started by a solo
entrepreneur. As previously noted, companies that end up closing or
getting acquired are often headed by a single individual.
4. It took Company B twice as long to secure their first round of funding,
suggesting a slower initial impact in their industry.
5. Company A has raised over 17M USD, compared to Company B’s 10M USD.
6. Company A has completed one additional funding round compared to
Company B.
7. Company A has attracted twice the number of
investors as Company B, which could indicate a compelling business
proposition.
| Chance of Company A being successful |
|---|
| 87% |
| Chance of Company B being successful |
|---|
| 9% |
As we can see, Company A is closer to getting a 20+ million
dollar funding round, while Company B is close to being acquired or
closed, with just a 9% chance of being successful.
In conclusion, the developed Logistic Regression model has
proven to be a highly effective tool in assessing the performance and
potential success of Latin American startups, as indicated by its
impressive accuracy and AUC values. It has provided invaluable insights
into the distinguishing factors between successful and unsuccessful
companies, shedding light on the importance of factors like the timing
of initial funding, the number of funding rounds, and the structure of
the founding team.
Moreover, the model’s robust performance
across both training and testing datasets underlines its reliability in
real-world applications. The ability to accurately predict company
success based on specific variables is of significant value to both
investors and entrepreneurs. This model can help companies identify
areas where they are underperforming and guide them to implement
strategic changes to drive growth and success.
While the model has demonstrated robust performance, it does
have some limitations.
One of the primary constraints is that
the model relies heavily on the structure of the founding team,
particularly whether the company has a solo founder or a team of
co-founders. This may lead to some bias in the model’s predictions, as
it places significant weight on this variable, potentially overlooking
other critical factors.
Furthermore, the model is designed to
analyze a specific set of variables, which may not encompass all factors
that could influence a company’s success. Business outcomes can be
influenced by a myriad of external factors, such as changes in market
trends, economic fluctuations, and regulatory changes, which the model
may not account for.
It is important to note that the model is
based on historical data, which means its predictions are inherently
rooted in past trends. While these trends can often provide a reliable
guide, they do not guarantee future outcomes, as the business landscape
is continually evolving.
Therefore, while the model provides
valuable insights, it should not be the sole tool used in
decision-making. It’s essential to use these predictions alongside other
business analysis tools, expert opinion, and consideration of broader
market and industry trends to make well-rounded decisions.